{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n"
     ]
    }
   ],
   "source": [
    "from scipy import stats\n",
    "import math"
   ]
  },
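  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Power analysis using Krippendorff's alpha for inter-rater reliability\n",
    "\n",
    "This notebook estimates the number of samples $T$ required for a given minimum Krippendorff's alpha and level of significance, using the formula below."
   ]
  },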
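  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$$\n",
    "T\\left(P_c, \\alpha_{\\text{min}}, p\\right) = 2 z_p^2 \\left(\\frac{\\left(1+\\alpha_{\\text{min}}\\right)\\left(3-\\alpha_{\\text{min}}\\right)}{4\\left(1-\\alpha_{\\text{min}}\\right) P_c\\left(1-P_c\\right)}-\\alpha_{\\text{min}}\\right)\n",
    "$$\n",
    "\n",
    "In the code below, $\\alpha_{\\text{min}}$ is `alpha`, $P_c$ is `probability_correct`, $p$ is `level_of_significance`, and $z_p$ is the standard normal critical value `z` for that level of significance."
   ]
  },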
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inputs of the sample-size formula\n",
    "probability_correct = 0.25  # chance level in a single-choice test with 4 options (25% random correct)\n",
    "alpha = 0.8  # effect size: Krippendorff's alpha\n",
    "level_of_significance = 0.05"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Critical value of the standard normal distribution for the chosen level of\n",
    "# significance. `isf` is the upper-tail quantile, so z is positive (ppf of the\n",
    "# same probability gives the negated value). Only z**2 enters the sample-size\n",
    "# formula, so the resulting sample size does not depend on the sign.\n",
    "z = stats.norm.isf(level_of_significance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "138.5238248496853"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def required_sample_size(alpha_min, p_c, z_p):\n",
    "    \"\"\"Evaluate the sample-size formula T(P_c, alpha_min, p) given above.\n",
    "\n",
    "    Args:\n",
    "        alpha_min: minimum Krippendorff's alpha; must be < 1.\n",
    "        p_c: the probability P_c; must lie strictly between 0 and 1.\n",
    "        z_p: standard normal critical value for the level of significance.\n",
    "\n",
    "    Returns:\n",
    "        The required number of samples as a float (not rounded).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the inputs would make the denominator zero or negative.\n",
    "    \"\"\"\n",
    "    if alpha_min >= 1 or not 0 < p_c < 1:\n",
    "        raise ValueError(\"alpha_min must be < 1 and p_c must be in (0, 1)\")\n",
    "    numerator = (1 + alpha_min) * (3 - alpha_min)\n",
    "    denominator = 4 * (1 - alpha_min) * p_c * (1 - p_c)\n",
    "    return 2 * z_p**2 * (numerator / denominator - alpha_min)\n",
    "\n",
    "\n",
    "t = required_sample_size(alpha, probability_correct, z)\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "23.087304141614215"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Required sample size divided by 6.\n",
    "# NOTE(review): 6 is presumably the number of groups/datasets the samples are split across -- confirm.\n",
    "t / 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We need to use 139 samples for a single-choice test with 4 options and alpha= 0.8 to achieve a level of significance of 0.05.\n"
     ]
    }
   ],
   "source": [
    "# Round the sample size up: a fractional sample cannot be collected.\n",
    "n_samples = math.ceil(t)\n",
    "# round() instead of int(): a float reciprocal that lands just below an integer\n",
    "# would otherwise be truncated to one option too few.\n",
    "n_options = round(1 / probability_correct)\n",
    "print(f\"We need to use {n_samples} samples for a single-choice test with {n_options} options and alpha= {alpha} to achieve a level of significance of {level_of_significance}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha = 0.85  # effect size, Krippendorff's alpha (second scenario; replaces the value set earlier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "186.71255992651797"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sample-size formula evaluated for the current alpha, split into its fraction parts\n",
    "numerator = (1 + alpha) * (3 - alpha)\n",
    "denominator = 4 * (1 - alpha) * probability_correct * (1 - probability_correct)\n",
    "t = 2 * z**2 * (numerator / denominator - alpha)\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We need to use 187 samples for a single-choice test with 4 options and alpha= 0.85 to achieve a level of significance of 0.05.\n"
     ]
    }
   ],
   "source": [
    "# Round the sample size up: a fractional sample cannot be collected.\n",
    "n_samples = math.ceil(t)\n",
    "# round() instead of int(): a float reciprocal that lands just below an integer\n",
    "# would otherwise be truncated to one option too few.\n",
    "n_options = round(1 / probability_correct)\n",
    "print(f\"We need to use {n_samples} samples for a single-choice test with {n_options} options and alpha= {alpha} to achieve a level of significance of {level_of_significance}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Very nice: our ThoughtSource dataset creation is somewhat limited by the open_book_qa test set, which has only 500 examples.\n",
    "# We could of course also use the open_book_qa validation set, but that needs a little extra work.\n",
    "# So we just go for two rounds, 500 examples in total:\n",
    "# alpha = 0.8 with 167 examples for exploratory analysis (Thoughtsource_1000)\n",
    "# alpha = 0.9 with 333 examples for hypothesis testing (Thoughtsource_2000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "166.66666666666666"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We will use a ThoughtSource dataset with 1000 samples in total.\n",
    "# Comparing on each single dataset would be too expensive, so the comparison\n",
    "# has to be done on ThoughtSource as a whole.\n",
    "1000 / 6"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
